========================================================
# global setting for this analysis
library(knitr)
# Chunk defaults: 12 x 8 figures, with warnings and messages kept out of
# the rendered output
opts_chunk$set(
  fig.width = 12,
  fig.height = 8,
  warning = FALSE,
  message = FALSE
)
# Load all of the packages that you end up using
# in your analysis in this code chunk.
suppressMessages(library(ggplot2))
suppressMessages(library(maps))
suppressMessages(library(dplyr))
suppressMessages(library(grid))
suppressMessages(library (gridExtra))
suppressMessages(library(GGally))
suppressMessages(library(scales))
# Load the built-in US state data sets (state.abb, state.center, ...);
# state.center and state.abb are used further down to place one point per
# state on the map.
data(state)
# Read the export through an explicit path instead of calling setwd(): the
# same file is loaded, but the session's working directory is left untouched.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (the read.csv default changed in R 4.0), matching the recorded str() output.
us_startups <- read.csv(
  file.path(
    "~/Downloads",
    "crunchbase_monthly_export_companies_1990_to_2014_us.csv"
  ),
  stringsAsFactors = TRUE
)
# Number of rows and columns in the loaded data (recorded output below)
dim(us_startups)
## [1] 22888 12
# Column names of the data set
names(us_startups)
## [1] "market" "funding_total_usd" "status"
## [4] "country_code" "state_code" "region"
## [7] "city" "funding_rounds" "founded_at"
## [10] "founded_month" "founded_quarter" "founded_year"
# Compact per-column overview: storage type and the first few values
str(us_startups)
## 'data.frame': 22888 obs. of 12 variables:
## $ market : Factor w/ 664 levels ""," 3D "," 3D Printing ",..: 568 276 458 181 55 55 386 55 1 276 ...
## $ funding_total_usd: Factor w/ 7153 levels " - ","1,000",..: 1460 5535 1 3440 6250 4729 5585 2836 2514 6095 ...
## $ status : Factor w/ 4 levels "","acquired",..: 2 4 4 4 2 4 3 4 4 4 ...
## $ country_code : Factor w/ 1 level "USA": 1 1 1 1 1 1 1 1 1 1 ...
## $ state_code : Factor w/ 52 levels "","AK","AL","AR",..: 21 6 45 12 46 11 6 22 40 21 ...
## $ region : Factor w/ 242 levels "AK - Other","Akron - Canton",..: 27 195 16 14 193 217 205 233 175 27 ...
## $ city : Factor w/ 2004 levels "","Abilene","Acton",..: 114 265 80 24 1548 193 1565 143 1375 1772 ...
## $ funding_rounds : int 1 2 1 2 2 1 1 9 1 3 ...
## $ founded_at : Factor w/ 2401 levels "1/1/00","1/1/01",..: 16 16 695 16 16 16 1228 16 16 16 ...
## $ founded_month : Factor w/ 269 levels "1990-01","1990-03",..: 1 1 7 1 1 1 3 1 1 1 ...
## $ founded_quarter : Factor w/ 99 levels "1990-Q1","1990-Q2",..: 1 1 4 1 1 1 2 1 1 1 ...
## $ founded_year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
# Per-column summary: level counts for factor columns, quartiles and mean
# for numeric columns (recorded output below)
summary(us_startups)
## market funding_total_usd status
## Software : 2435 - : 3433 : 402
## Biotechnology : 1863 1,000,000: 400 acquired : 2190
## : 903 100,000 : 358 closed : 1156
## Mobile : 855 500,000 : 339 operating:19140
## Curated Web : 774 2,000,000: 298
## Enterprise Software : 738 250,000 : 281
## (Other) :15320 (Other) :17779
## country_code state_code region
## USA:22888 CA :8154 SF Bay Area : 5738
## NY :2465 New York City : 2215
## MA :1503 Boston : 1426
## TX :1145 Los Angeles : 1133
## WA : 799 Seattle : 767
## FL : 758 Washington, D.C.: 613
## (Other):8064 (Other) :10996
## city funding_rounds founded_at founded_month
## San Francisco: 2275 Min. : 1.000 1/1/11 : 1389 2011-01: 1462
## New York : 2008 1st Qu.: 1.000 1/1/12 : 1271 2012-01: 1355
## Palo Alto : 500 Median : 1.000 1/1/10 : 1250 2010-01: 1302
## Austin : 479 Mean : 1.988 1/1/09 : 1123 2009-01: 1150
## Seattle : 466 3rd Qu.: 2.000 1/1/07 : 966 2007-01: 982
## Chicago : 439 Max. :18.000 1/1/08 : 897 2013-01: 927
## (Other) :16721 (Other):15992 (Other):15710
## founded_quarter founded_year
## 2011-Q1: 1716 Min. :1990
## 2012-Q1: 1663 1st Qu.:2005
## 2010-Q1: 1473 Median :2009
## 2009-Q1: 1255 Mean :2008
## 2013-Q1: 1162 3rd Qu.:2011
## 2007-Q1: 1072 Max. :2014
## (Other):14547
# Count the startups in each market category
# (resulting columns: Var1 = market label, Freq = number of startups)
markets <- as.data.frame(table(us_startups$market))
# Base plot: one x position per market, startup count on the y axis
p0 <- ggplot(data = markets, mapping = aes(x = Var1, y = Freq))
# Pink points plus the market label, both sized by the startup count
p0_1 <- p0 +
  geom_point(mapping = aes(size = Freq), color = "pink") +
  geom_text(mapping = aes(label = Var1, size = Freq)) +
  theme_bw()
p0_1
# Same plot on a log10 y axis so the smaller markets are easier to tell apart
p0_1 + scale_y_log10()
# total fundings
# funding_total_usd arrives as text such as "1,000,000", with " - " standing
# in for an unknown amount. Convert it to numeric in explicit steps.
funding_text <- as.character(us_startups$funding_total_usd)
# Map the "-" placeholder to NA up front, so that any coercion warning still
# raised by as.numeric() below points at genuinely malformed values.
funding_text[grepl("^\\s*-\\s*$", funding_text)] <- NA_character_
# Strip the thousands separators, then convert
us_startups$funding_total_usd <- as.numeric(gsub(",", "", funding_text, fixed = TRUE))
# Distribution of total funding. Startups without a funding amount or with a
# blank status are left out of every plot built on p1.
p1 <- ggplot(
  aes(x = funding_total_usd),
  data = subset(us_startups, !is.na(funding_total_usd) & status != "")
)
# histogram with default settings
p1_1 <- p1 + geom_histogram()
p1_1
# log10 x axis: the raw amounts have a long right tail
p1_1 + scale_x_log10()
# frequency polygon of the same distribution
p1 + geom_freqpoly() + scale_x_log10()
# narrower bins (0.2 wide on the log10 scale) and a tick at each power of ten
p1 + geom_histogram(binwidth = 0.2) +
  scale_x_log10(breaks = c(1e+03, 1e+04, 1e+05, 1e+06, 1e+07, 1e+08, 1e+09))
# check for differences between startups with different status.
# The y aesthetic is ..density.., so the axis is labelled "Density"; x is
# inherited from p1 and does not need to be repeated here.
p1 + geom_histogram(aes(y = ..density.., fill = status), binwidth = 0.2) +
  labs(x = "Total fundings ($USD)", y = "Density") +
  ggtitle("Histogram of total fundings for startups ($USD, log10)") +
  scale_x_log10(breaks = c(1e+04, 1e+06, 1e+08)) +
  facet_wrap(~ status)
# Distribution of the number of funding rounds per startup
p2 <- ggplot(aes(x = funding_rounds), data = us_startups)
# histogram with default settings
p2 + geom_histogram()
# one bin per round count. funding_rounds is an integer column, so the tick
# marks are set on a continuous scale; scale_x_discrete() is meant for
# factor/character data.
p2 + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(1, 16, 1))
# zoom into 5-15 rounds and counts up to 1000 to see the sparsely populated
# tail; coord_cartesian() zooms without dropping data
p2 + geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(5, 15), ylim = c(0, 1000))
# Distribution of founding years
p3 <- ggplot(aes(x = founded_year), data = us_startups)
# histogram of founded years: exactly one bin per year (the default bin count
# does not line up with the integer years and yields uneven bars), with a
# tick for every year on a continuous scale since founded_year is numeric
p3 + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(1990, 2014, 1)) +
  coord_cartesian(xlim = c(1989, 2015))
# get months from founded_at
# founded_at holds dates written like "1/1/11" (month/day/two-digit year), so
# the year is parsed with %y; %Y would read "11" as the year 0011.
us_startups$founded_at <- as.character(us_startups$founded_at)
# Overwrites the original "YYYY-MM" founded_month column with the month
# number (1-12)
us_startups$founded_month <- as.numeric(
  strftime(as.Date(us_startups$founded_at, "%m/%d/%y"), "%m")
)
# histogram of founded months, one bin per month
p4 <- ggplot(aes(x = founded_month), data = us_startups)
p4 + geom_histogram(binwidth = 1)
# get days from founded_at
# founded_at holds dates written like "1/1/11" (month/day/two-digit year), so
# the year is parsed with %y; %Y would read "11" as the year 0011.
us_startups$founded_day <- as.numeric(
  strftime(as.Date(us_startups$founded_at, "%m/%d/%y"), "%d")
)
# histogram of founded days, one bin per day of the month. founded_day is
# numeric, so the ticks are set on a continuous scale.
p5 <- ggplot(aes(x = founded_day), data = us_startups)
p5 + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(0, 31, 1))
# distributions in states: which state has more startups?
# load us map data (polygon outlines of the states)
us_state_map <- map_data("state")
# one row per state: centre coordinates (x, y) and the two-letter code
states <- data.frame(state.center, state.abb)
# calculate frequency of startups in each state
regions <- as.data.frame(table(us_startups$state_code))
# merge startup counts with the state centres; all = TRUE keeps codes that
# appear on only one side, which then carry NA coordinates or NA counts
colnames(regions)[1] <- "state.abb"
regions <- merge(states, regions, by = "state.abb", all = TRUE)
# draw us map
p6 <- ggplot(aes(x = long, y = lat, group = group), data = us_state_map)
# use size of points to represent number of startups in each state.
# State borders are dashed: the parameter is `linetype` (the former
# `linestyle` is not a ggplot2 parameter, so it never took effect).
p6 + geom_polygon(fill = "white") +
  geom_path(colour = "grey", linetype = 2) +
  geom_point(data = regions, aes(x = x, y = y, group = NULL, size = Freq), color = "red") +
  geom_text(data = regions, aes(x = x, y = y, label = state.abb, group = NULL), size = 3, color = "blue") +
  coord_map("polyconic") +
  theme_bw()
# Which region hosts the most startups?
# Startup count per region (columns: Var1 = region, Freq = count)
startups_by_regions <- as.data.frame(table(us_startups$region))
# Base plot: region on x, count on a log10 y axis, black-and-white theme
p7 <- ggplot(data = startups_by_regions, mapping = aes(x = Var1, y = Freq)) +
  theme_bw() +
  scale_y_log10()
# Pink points and region labels, both sized by the count
p7 +
  geom_point(mapping = aes(size = Freq), color = "pink") +
  geom_text(mapping = aes(label = Var1, size = Freq))
# Which city hosts the most startups?
# Startup count per city (columns: Var1 = city, Freq = count)
startups_by_cities <- as.data.frame(table(us_startups$city))
# Base plot: city on x, count on a log10 y axis, black-and-white theme
p8 <- ggplot(data = startups_by_cities, mapping = aes(x = Var1, y = Freq)) +
  theme_bw() +
  scale_y_log10()
# Pink points and city labels, both sized by the count
p8 +
  geom_point(mapping = aes(size = Freq), color = "pink") +
  geom_text(mapping = aes(label = Var1, size = Freq))
# Share of startups per status, drawn as a pie chart: a single stacked bar
# (x fixed at factor(1)) wrapped into polar coordinates. Rows with a blank
# status are excluded.
p9 <- ggplot(
  data = subset(us_startups, status != ""),
  mapping = aes(x = factor(1), fill = status)
)
p9_1 <- p9 +
  geom_bar() +
  coord_polar(theta = "y")
p9_1
# One pie per state
p9_1 + facet_wrap(~state_code)
# One pie per founding year
p9_1 + facet_wrap(~founded_year)
# get the markets with more than 500 startups, skipping the blank market label
sub_markets <- subset(markets, Freq > 500 & Var1 != "")
c_market <- as.character(sub_markets$Var1)
# list hot market for startups
c_market
## [1] " Advertising " " Biotechnology "
## [3] " Curated Web " " E-Commerce "
## [5] " Enterprise Software " " Hardware + Software "
## [7] " Health and Wellness " " Health Care "
## [9] " Mobile " " Software "
# Startups in those hot markets that have a known funding amount. The filter
# reuses c_market rather than a retyped copy of the list, so it cannot drift
# from the threshold above if the data or the cut-off changes.
hot_startups <- subset(
  us_startups,
  market %in% c_market & !is.na(funding_total_usd)
)
# Shared base for the per-market funding plots: market on x, total funding
# on a log10 y axis
p10 <- ggplot(
  data = hot_startups,
  mapping = aes(x = market, y = funding_total_usd)
) +
  scale_y_log10()
# Scatter plot, one colour per market
p10 + geom_point(mapping = aes(color = market))
# Boxplot, one fill colour per market
p10_1 <- p10 + geom_boxplot(mapping = aes(fill = market))
p10_1
# Zoom the y axis to 1e5-1e8 so the boxes are easier to compare
p10_2 <- p10_1 + coord_cartesian(ylim = c(1e+05, 1e+08))
p10_2
# Same two views restricted to startups with exactly one funding round
# (used in this analysis as the "initial funding")
p10_3 <- ggplot(
  data = subset(hot_startups, funding_rounds == 1),
  mapping = aes(x = market, y = funding_total_usd)
) +
  scale_y_log10()
p10_4 <- p10_3 + geom_point(mapping = aes(color = market))
p10_5 <- p10_3 + geom_boxplot(mapping = aes(fill = market))
# Scatter plot above, boxplot below
grid.arrange(p10_4, p10_5, ncol = 1)
# Per-market summary of the hot startups: mean/median total funding,
# mean/median number of funding rounds and the number of startups (n),
# ordered from the highest to the lowest mean funding
hot_startups.by.market <- arrange(
  summarise(
    group_by(hot_startups, market),
    mean_fundings = mean(funding_total_usd),
    median_fundings = median(funding_total_usd),
    mean_rounds = mean(funding_rounds),
    median_rounds = median(funding_rounds),
    n = n()
  ),
  desc(mean_fundings)
)
# Print the table
hot_startups.by.market
## Source: local data frame [10 x 6]
##
## market mean_fundings median_fundings mean_rounds
## 1 Health Care 32480204 9635990 2.783912
## 2 Biotechnology 22261244 4937793 2.423394
## 3 Enterprise Software 20544889 6770000 2.464179
## 4 Advertising 19049650 4057165 2.377358
## 5 Health and Wellness 16134642 1837712 1.808824
## 6 E-Commerce 14686101 2000000 2.059041
## 7 Hardware + Software 12980316 2991204 2.047809
## 8 Software 11859619 3000000 1.951524
## 9 Mobile 11802490 2769850 2.246134
## 10 Curated Web 7873856 1400000 1.822257
## Variables not shown: median_rounds (int), n (int)
# Same per-market summary, restricted to startups with exactly one funding
# round (used in this analysis as the "initial funding"); ordered from the
# highest to the lowest mean funding
hot_startups.by.market.ini <- arrange(
  summarise(
    group_by(subset(hot_startups, funding_rounds == 1), market),
    mean_fundings = mean(funding_total_usd),
    median_fundings = median(funding_total_usd),
    mean_rounds = mean(funding_rounds),
    median_rounds = median(funding_rounds),
    n = n()
  ),
  desc(mean_fundings)
)
# Print the table
hot_startups.by.market.ini
## Source: local data frame [10 x 6]
##
## market mean_fundings median_fundings mean_rounds
## 1 Health and Wellness 15253212 829972 1
## 2 Enterprise Software 13037627 2400000 1
## 3 Health Care 10018856 1500000 1
## 4 Biotechnology 9672169 1500000 1
## 5 Mobile 4826191 1000000 1
## 6 Hardware + Software 4811947 1067000 1
## 7 Software 4230813 1038548 1
## 8 Advertising 3985271 1000000 1
## 9 E-Commerce 3525078 500000 1
## 10 Curated Web 2568273 500000 1
## Variables not shown: median_rounds (dbl), n (int)
# Total funding against founding year (log10 y axis); startups without a
# funding amount are left out
p11 <- ggplot(
  data = subset(us_startups, !is.na(funding_total_usd)),
  mapping = aes(x = founded_year, y = funding_total_usd)
) +
  scale_y_log10()
p11_1 <- p11 + geom_point(mapping = aes(color = founded_year))
# Red line: per-year mean.
# NOTE(review): stat_summary() works on the values after the log10 scale
# transform, so this is presumably the mean of log10(funding) - confirm that
# is the intended statistic.
p11_2 <- p11_1 + stat_summary(fun.y = mean, geom = "line", color = "red")
# Green line: per-year median
p11_3 <- p11_1 + stat_summary(fun.y = median, geom = "line", color = "green")
# Mean plot above, median plot below
grid.arrange(p11_2, p11_3, ncol = 1)
# Funding against founding year for startups with exactly one funding round
# (the "initial funding"), log10 y axis.
# Note: this reuses the name p11_3, replacing the median plot built earlier.
p11_3 <- ggplot(
  data = subset(us_startups, !is.na(funding_total_usd) & funding_rounds == 1),
  mapping = aes(x = founded_year, y = funding_total_usd)
) +
  scale_y_log10()
p11_4 <- p11_3 + geom_point(mapping = aes(color = founded_year))
# Red line: per-year mean
p11_5 <- p11_4 + stat_summary(fun.y = mean, geom = "line", color = "red")
# Green line: per-year median
p11_6 <- p11_4 + stat_summary(fun.y = median, geom = "line", color = "green")
# Mean plot above, median plot below
grid.arrange(p11_5, p11_6, ncol = 1)
# Total funding per state (log10 y axis); startups without a funding amount
# are left out
p12 <- ggplot(
  data = subset(us_startups, !is.na(funding_total_usd)),
  mapping = aes(x = state_code, y = funding_total_usd)
) +
  scale_y_log10()
# Scatter plot coloured by state, legend hidden and axis text shrunk so the
# state codes fit
p12_1 <- p12 +
  geom_point(mapping = aes(color = state_code)) +
  theme(legend.position = "none", axis.text = element_text(size = 6))
# Boxplot layered on top of the scatter plot
p12_2 <- p12_1 + geom_boxplot(mapping = aes(fill = state_code))
# Scatter plot above, scatter plus boxplot below
grid.arrange(p12_1, p12_2, ncol = 1)
# scatter plot for initial funding each state: only startups with exactly
# one funding round and a known funding amount
p12_3 <- ggplot(aes(x = state_code, y = funding_total_usd), data = subset(us_startups, !is.na(funding_total_usd) & funding_rounds == 1)) + scale_y_log10()
# draw scatterplot on the single-round base plot p12_3 (building on p12
# would silently repeat the all-startups plot)
p12_4 <- p12_3 + geom_point(aes(color = state_code))
# hide the legend and shrink the axis text so the state codes fit
p12_4 <- p12_4 + theme(legend.position = "none", axis.text = element_text(size = 6))
# draw boxplot on top of the scatter plot
p12_5 <- p12_4 + geom_boxplot(aes(fill = state_code))
# arrange in a column
grid.arrange(p12_4, p12_5, ncol = 1)
# Total funding by startup status (log10 y axis); rows without a funding
# amount or with a blank status are left out
p13 <- ggplot(
  data = subset(us_startups, !is.na(funding_total_usd) & status != ""),
  mapping = aes(x = status, y = funding_total_usd)
) +
  scale_y_log10()
# Scatter plot, one colour per status
p13_1 <- p13 + geom_point(mapping = aes(color = status))
# Boxplot layered on top of the scatter plot
p13_2 <- p13_1 + geom_boxplot(mapping = aes(fill = status))
# Scatter plot above, scatter plus boxplot below
grid.arrange(p13_1, p13_2, ncol = 1)
# Funding by status for startups with exactly one funding round (the
# "initial funding"), log10 y axis; rows without a funding amount or with a
# blank status are left out
p13_3 <- ggplot(
  data = subset(
    us_startups,
    !is.na(funding_total_usd) & status != "" & funding_rounds == 1
  ),
  mapping = aes(x = status, y = funding_total_usd)
) +
  scale_y_log10()
# Scatter plot, one colour per status
p13_4 <- p13_3 + geom_point(mapping = aes(color = status))
# Boxplot layered on top of the scatter plot
p13_5 <- p13_4 + geom_boxplot(mapping = aes(fill = status))
# Scatter plot above, scatter plus boxplot below
grid.arrange(p13_4, p13_5, ncol = 1)
# ANOVA analyses for total fundings
# Models funding_total_usd on founding year, state, region, city and status
# using all rows of us_startups; status is not filtered here, so the blank
# status level is part of the model. Rows with a missing funding amount are
# dropped by the fit (see "observations deleted due to missingness" in the
# recorded output).
# The summary table uses sequential sums of squares: each term is assessed
# after the terms listed before it, so the order of terms in the formula
# affects the table.
fit <- aov(funding_total_usd ~ founded_year + state_code + region + city + status, data = us_startups)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## founded_year 1 2.203e+18 2.203e+18 302.219 < 2e-16 ***
## state_code 51 6.414e+17 1.258e+16 1.726 0.00101 **
## region 190 5.459e+17 2.873e+15 0.394 1.00000
## city 1536 6.429e+18 4.185e+15 0.574 1.00000
## status 3 1.176e+17 3.921e+16 5.379 0.00107 **
## Residuals 17673 1.288e+20 7.288e+15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 3433 observations deleted due to missingness
# ANOVA analyses for initial fundings
# Same model as for the total fundings, fitted only to startups with exactly
# one funding round (funding_rounds == 1). Rows with a missing funding amount
# are dropped by the fit.
# The summary table uses sequential sums of squares, so the order of terms in
# the formula affects the table.
fit_ini <- aov(funding_total_usd ~ founded_year + state_code + region + city + status, data = subset(us_startups, funding_rounds == 1))
summary(fit_ini)
## Df Sum Sq Mean Sq F value Pr(>F)
## founded_year 1 2.175e+17 2.175e+17 80.403 <2e-16 ***
## state_code 51 9.924e+16 1.946e+15 0.719 0.934
## region 183 4.995e+17 2.729e+15 1.009 0.453
## city 1219 3.026e+18 2.482e+15 0.918 0.974
## status 3 1.016e+16 3.387e+15 1.252 0.289
## Residuals 7923 2.143e+19 2.705e+15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 3249 observations deleted due to missingness
# Funding of the hot startups against founding year (log10 y axis),
# coloured by market
p14 <- ggplot(
  data = hot_startups,
  mapping = aes(x = founded_year, y = funding_total_usd)
) +
  scale_y_log10()
p14_1 <- p14 + geom_point(mapping = aes(color = market))
# One line per market through the per-year mean
p14_2 <- p14_1 +
  stat_summary(fun.y = mean, geom = "line", mapping = aes(color = market))
p14_2
# Funding of the hot startups per state (log10 y axis, small axis text so
# the state codes fit)
p15 <- ggplot(
  data = hot_startups,
  mapping = aes(x = state_code, y = funding_total_usd)
) +
  scale_y_log10() +
  theme(axis.text = element_text(size = 6))
# Line layer, one colour per market
p15_1 <- p15 + geom_line(mapping = aes(color = market))
# Large point at the per-state mean of each market
p15_2 <- p15_1 +
  stat_summary(fun.y = mean, geom = "point", size = 5, mapping = aes(color = market))
p15_2
# Funding of the hot startups per status (log10 y axis); rows with a blank
# status are left out
p16 <- ggplot(
  data = subset(hot_startups, status != ""),
  mapping = aes(x = status, y = funding_total_usd)
) +
  scale_y_log10()
# Line layer, one colour per market
p16_1 <- p16 + geom_line(mapping = aes(color = market))
# Large point at the per-status mean of each market
p16_2 <- p16_1 +
  stat_summary(fun.y = mean, geom = "point", size = 5, mapping = aes(color = market))
p16_2
# Funding of the hot startups against the number of funding rounds
# (log10 y axis), coloured by market
p17 <- ggplot(
  data = hot_startups,
  mapping = aes(x = funding_rounds, y = funding_total_usd)
) +
  scale_y_log10()
p17_1 <- p17 + geom_point(mapping = aes(color = market))
# One line per market through the mean at each round count
p17_2 <- p17_1 +
  stat_summary(fun.y = mean, geom = "line", mapping = aes(color = market))
p17_2
# load library
# plyr is attached after dplyr, so unqualified calls to functions that both
# packages export (e.g. count(), summarise()) resolve by attach order. The
# plyr calls below are namespace-qualified so they do not depend on it.
suppressMessages(library(plyr))
# sum total fundings by market by status (rows with a blank status excluded).
# ddply() does the splitting by market and status itself, so no prior
# group_by() is needed.
total_funding_by_status_by_market <- plyr::ddply(
  subset(hot_startups, status != "", select = c("market", "status", "funding_total_usd")),
  c("market", "status"),
  plyr::summarize,
  fundings = sum(funding_total_usd)
)
# count number of startups in each status by each market (column: freq)
number_of_startups_by_status <- plyr::count(
  subset(hot_startups, status != ""),
  c("market", "status")
)
# merge total fundings to number of startups data frame
number_of_startups_by_status <- merge(
  number_of_startups_by_status,
  total_funding_by_status_by_market,
  by = c("market", "status")
)
# scatter plot of the number of startups in each status by market; point area
# represents the total funding, and points are jittered to reduce overlap
ggplot(aes(x = status, y = freq), data = number_of_startups_by_status) +
  scale_size_area(max_size = 20) +
  scale_y_log10() +
  geom_point(aes(color = market, size = fundings), position = position_jitter(width = 0.4))
# Histogram of total fundings for startups with different status.
# Rows without a funding amount or with a blank status are left out, the
# same filter as the exploratory version of this plot.
p1 <- ggplot(
  aes(x = funding_total_usd),
  data = subset(us_startups, !is.na(funding_total_usd) & status != "")
)
# The y aesthetic is ..density.., so the axis is labelled "Density"; bins are
# 0.2 wide on the log10 scale, one panel per status
p1 + geom_histogram(aes(y = ..density.., fill = status), binwidth = 0.2) +
  theme(axis.text = element_text(size = 10)) +
  labs(x = "Total fundings ($USD)", y = "Density") +
  ggtitle("Histogram of total fundings for startups ($USD, log10)") +
  scale_x_log10(breaks = c(1e+04, 1e+06, 1e+08)) +
  facet_wrap(~ status)
# Final version of the per-market funding plot: log10 y axis, smaller axis
# text, title and axis labels, with the points coloured by market
p10 <- ggplot(
  data = hot_startups,
  mapping = aes(x = market, y = funding_total_usd)
) +
  scale_y_log10() +
  theme(axis.text = element_text(size = 8)) +
  ggtitle("Fundings for each market of hot startups ($USD, log10)") +
  labs(x = "Market of startups", y = "Total fundings ($USD)") +
  geom_point(mapping = aes(color = market))
# Boxplot layered on top of the points
p10_1 <- p10 + geom_boxplot(mapping = aes(fill = market))
p10_1
# Final version of the per-state funding plot for the hot startups:
# log10 y axis, smaller axis text, title and axis labels
p15 <- ggplot(
  data = hot_startups,
  mapping = aes(x = state_code, y = funding_total_usd)
) +
  scale_y_log10() +
  theme(axis.text = element_text(size = 8)) +
  ggtitle("Fundings ($USD, log10) for each market of hot startups in each state") +
  labs(x = "States", y = "Total fundings ($USD)")
# Line layer, one colour per market
p15_1 <- p15 + geom_line(mapping = aes(color = market))
# Large point at the per-state mean of each market
p15_2 <- p15_1 +
  stat_summary(fun.y = mean, geom = "point", size = 5, mapping = aes(color = market))
p15_2
# sum total fundings by market by status (rows with a blank status excluded).
# ddply() does the splitting by market and status itself, so no prior
# group_by() is needed; the plyr calls are namespace-qualified because dplyr
# exports functions with the same names.
total_funding_by_status_by_market <- plyr::ddply(
  subset(hot_startups, status != "", select = c("market", "status", "funding_total_usd")),
  c("market", "status"),
  plyr::summarize,
  fundings = sum(funding_total_usd)
)
# count number of startups in each status by each market (column: freq)
number_of_startups_by_status <- plyr::count(
  subset(hot_startups, status != ""),
  c("market", "status")
)
# merge total fundings to number of startups data frame
number_of_startups_by_status <- merge(
  number_of_startups_by_status,
  total_funding_by_status_by_market,
  by = c("market", "status")
)
# scatter plot of the number of startups in each status by market; point area
# represents the total funding, and points are jittered to reduce overlap
ggplot(aes(x = status, y = freq), data = number_of_startups_by_status) +
  scale_y_log10() +
  geom_point(aes(color = market, size = fundings), position = position_jitter(width = 0.4)) +
  scale_size_area(max_size = 20) +
  theme(axis.text = element_text(size = 10)) +
  ggtitle("Number of hot startups (log10) for each market in each status") +
  labs(x = "Status", y = "Number of hot startups (log10)")